This document contains code that is too long to display in the slides.
You should use this supplement to follow along and try executing the example code yourself.
When you execute a statement like x <- 5 in R, you are creating an object in memory which holds the numeric value 5 and is referenced by the variable name “x”.
If you later ask R to do something like y <- x + 2, it will search sequentially through a series of namespaces until it finds a variable called “x”. Namespaces can be thoguht of as collections of labels pointing to places in memory. You can use R’s search() command to examine the ordered list of namespaces that R will search in for variables.
# Check the search path of namespaces
search()
## [1] ".GlobalEnv" "package:stats" "package:graphics"
## [4] "package:grDevices" "package:utils" "package:datasets"
## [7] "package:methods" "Autoloads" "package:base"
# use ls() to list the objects in one of those namespaces
ls("package:stats")
## [1] "acf" "acf2AR" "add.scope"
## [4] "add1" "addmargins" "aggregate"
## [7] "aggregate.data.frame" "aggregate.ts" "AIC"
## [10] "alias" "anova" "ansari.test"
## [13] "aov" "approx" "approxfun"
## [16] "ar" "ar.burg" "ar.mle"
## [19] "ar.ols" "ar.yw" "arima"
## [22] "arima.sim" "arima0" "arima0.diag"
## [25] "ARMAacf" "ARMAtoMA" "as.dendrogram"
## [28] "as.dist" "as.formula" "as.hclust"
## [31] "as.stepfun" "as.ts" "asOneSidedFormula"
## [34] "ave" "bandwidth.kernel" "bartlett.test"
## [37] "BIC" "binom.test" "binomial"
## [40] "biplot" "Box.test" "bw.bcv"
## [43] "bw.nrd" "bw.nrd0" "bw.SJ"
## [46] "bw.ucv" "C" "cancor"
## [49] "case.names" "ccf" "chisq.test"
## [52] "cmdscale" "coef" "coefficients"
## [55] "complete.cases" "confint" "confint.default"
## [58] "confint.lm" "constrOptim" "contr.helmert"
## [61] "contr.poly" "contr.SAS" "contr.sum"
## [64] "contr.treatment" "contrasts" "contrasts<-"
## [67] "convolve" "cooks.distance" "cophenetic"
## [70] "cor" "cor.test" "cov"
## [73] "cov.wt" "cov2cor" "covratio"
## [76] "cpgram" "cutree" "cycle"
## [79] "D" "dbeta" "dbinom"
## [82] "dcauchy" "dchisq" "decompose"
## [85] "delete.response" "deltat" "dendrapply"
## [88] "density" "density.default" "deriv"
## [91] "deriv3" "deviance" "dexp"
## [94] "df" "df.kernel" "df.residual"
## [97] "DF2formula" "dfbeta" "dfbetas"
## [100] "dffits" "dgamma" "dgeom"
## [103] "dhyper" "diffinv" "dist"
## [106] "dlnorm" "dlogis" "dmultinom"
## [109] "dnbinom" "dnorm" "dpois"
## [112] "drop.scope" "drop.terms" "drop1"
## [115] "dsignrank" "dt" "dummy.coef"
## [118] "dummy.coef.lm" "dunif" "dweibull"
## [121] "dwilcox" "ecdf" "eff.aovlist"
## [124] "effects" "embed" "end"
## [127] "estVar" "expand.model.frame" "extractAIC"
## [130] "factanal" "factor.scope" "family"
## [133] "fft" "filter" "fisher.test"
## [136] "fitted" "fitted.values" "fivenum"
## [139] "fligner.test" "formula" "frequency"
## [142] "friedman.test" "ftable" "Gamma"
## [145] "gaussian" "get_all_vars" "getCall"
## [148] "getInitial" "glm" "glm.control"
## [151] "glm.fit" "hasTsp" "hat"
## [154] "hatvalues" "hclust" "heatmap"
## [157] "HoltWinters" "influence" "influence.measures"
## [160] "integrate" "interaction.plot" "inverse.gaussian"
## [163] "IQR" "is.empty.model" "is.leaf"
## [166] "is.mts" "is.stepfun" "is.ts"
## [169] "is.tskernel" "isoreg" "KalmanForecast"
## [172] "KalmanLike" "KalmanRun" "KalmanSmooth"
## [175] "kernapply" "kernel" "kmeans"
## [178] "knots" "kruskal.test" "ks.test"
## [181] "ksmooth" "lag" "lag.plot"
## [184] "line" "lm" "lm.fit"
## [187] "lm.influence" "lm.wfit" "loadings"
## [190] "loess" "loess.control" "loess.smooth"
## [193] "logLik" "loglin" "lowess"
## [196] "ls.diag" "ls.print" "lsfit"
## [199] "mad" "mahalanobis" "make.link"
## [202] "makeARIMA" "makepredictcall" "manova"
## [205] "mantelhaen.test" "mauchly.test" "mcnemar.test"
## [208] "median" "median.default" "medpolish"
## [211] "model.extract" "model.frame" "model.frame.default"
## [214] "model.matrix" "model.matrix.default" "model.matrix.lm"
## [217] "model.offset" "model.response" "model.tables"
## [220] "model.weights" "monthplot" "mood.test"
## [223] "mvfft" "na.action" "na.contiguous"
## [226] "na.exclude" "na.fail" "na.omit"
## [229] "na.pass" "napredict" "naprint"
## [232] "naresid" "nextn" "nlm"
## [235] "nlminb" "nls" "nls.control"
## [238] "NLSstAsymptotic" "NLSstClosestX" "NLSstLfAsymptote"
## [241] "NLSstRtAsymptote" "nobs" "numericDeriv"
## [244] "offset" "oneway.test" "optim"
## [247] "optimHess" "optimise" "optimize"
## [250] "order.dendrogram" "p.adjust" "p.adjust.methods"
## [253] "pacf" "Pair" "pairwise.prop.test"
## [256] "pairwise.t.test" "pairwise.table" "pairwise.wilcox.test"
## [259] "pbeta" "pbinom" "pbirthday"
## [262] "pcauchy" "pchisq" "pexp"
## [265] "pf" "pgamma" "pgeom"
## [268] "phyper" "plclust" "plnorm"
## [271] "plogis" "plot.ecdf" "plot.spec.coherency"
## [274] "plot.spec.phase" "plot.stepfun" "plot.ts"
## [277] "pnbinom" "pnorm" "poisson"
## [280] "poisson.test" "poly" "polym"
## [283] "power" "power.anova.test" "power.prop.test"
## [286] "power.t.test" "PP.test" "ppoints"
## [289] "ppois" "ppr" "prcomp"
## [292] "predict" "predict.glm" "predict.lm"
## [295] "preplot" "princomp" "printCoefmat"
## [298] "profile" "proj" "promax"
## [301] "prop.test" "prop.trend.test" "psignrank"
## [304] "pt" "ptukey" "punif"
## [307] "pweibull" "pwilcox" "qbeta"
## [310] "qbinom" "qbirthday" "qcauchy"
## [313] "qchisq" "qexp" "qf"
## [316] "qgamma" "qgeom" "qhyper"
## [319] "qlnorm" "qlogis" "qnbinom"
## [322] "qnorm" "qpois" "qqline"
## [325] "qqnorm" "qqplot" "qsignrank"
## [328] "qt" "qtukey" "quade.test"
## [331] "quantile" "quasi" "quasibinomial"
## [334] "quasipoisson" "qunif" "qweibull"
## [337] "qwilcox" "r2dtable" "rbeta"
## [340] "rbinom" "rcauchy" "rchisq"
## [343] "read.ftable" "rect.hclust" "reformulate"
## [346] "relevel" "reorder" "replications"
## [349] "reshape" "resid" "residuals"
## [352] "residuals.glm" "residuals.lm" "rexp"
## [355] "rf" "rgamma" "rgeom"
## [358] "rhyper" "rlnorm" "rlogis"
## [361] "rmultinom" "rnbinom" "rnorm"
## [364] "rpois" "rsignrank" "rstandard"
## [367] "rstudent" "rt" "runif"
## [370] "runmed" "rweibull" "rwilcox"
## [373] "rWishart" "scatter.smooth" "screeplot"
## [376] "sd" "se.contrast" "selfStart"
## [379] "setNames" "shapiro.test" "sigma"
## [382] "simulate" "smooth" "smooth.spline"
## [385] "smoothEnds" "sortedXyData" "spec.ar"
## [388] "spec.pgram" "spec.taper" "spectrum"
## [391] "spline" "splinefun" "splinefunH"
## [394] "SSasymp" "SSasympOff" "SSasympOrig"
## [397] "SSbiexp" "SSD" "SSfol"
## [400] "SSfpl" "SSgompertz" "SSlogis"
## [403] "SSmicmen" "SSweibull" "start"
## [406] "stat.anova" "step" "stepfun"
## [409] "stl" "StructTS" "summary.aov"
## [412] "summary.glm" "summary.lm" "summary.manova"
## [415] "summary.stepfun" "supsmu" "symnum"
## [418] "t.test" "termplot" "terms"
## [421] "terms.formula" "time" "toeplitz"
## [424] "ts" "ts.intersect" "ts.plot"
## [427] "ts.union" "tsdiag" "tsp"
## [430] "tsp<-" "tsSmooth" "TukeyHSD"
## [433] "uniroot" "update" "update.default"
## [436] "update.formula" "var" "var.test"
## [439] "variable.names" "varimax" "vcov"
## [442] "weighted.mean" "weighted.residuals" "weights"
## [445] "wilcox.test" "window" "window<-"
## [448] "write.ftable" "xtabs"
# Addition with "+"
4 + 5
## [1] 9
# Subtraction with "-"
100 - 99
## [1] 1
# Multiplication with "*"
4 * 5
## [1] 20
# Division with "/"
15 / 3
## [1] 5
# Exponentiation with "^"
2^3
## [1] 8
# Order of Operations
4 * 5 + 5 / 5
## [1] 21
# Control with parentheses
4 * (5 + 5) / 5
## [1] 8
# Ordered sequence of integers
1:5
## [1] 1 2 3 4 5
# Counting by 2s
seq(from = 0, to = 14, by = 2)
## [1] 0 2 4 6 8 10 12 14
# Replicate the same values
rep(TRUE, 6)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
# Concatenate multiple values with the "c" operator
c("all", "of", "the", "lights")
## [1] "all" "of" "the" "lights"
# Watch out! Mixing types wil lead to silent coercion
c(1, TRUE, "hellos")
## [1] "1" "TRUE" "hellos"
# Some functions, when applied over a vector, return a single value
is.numeric(rnorm(100))
## [1] TRUE
# Others will return a vector of results
is.na(c(1, 5, 10, NA, 8))
## [1] FALSE FALSE FALSE TRUE FALSE
# Vectors can be named
batting_avg <- c(
youkilis = 0.300
, ortiz = 0.355
, nixon = 0.285
)
print(batting_avg)
## youkilis ortiz nixon
## 0.300 0.355 0.285
# You can combine two vectors with c()
x <- c("a", "b", "c")
y <- c("1", "2", "3")
c(x, y)
## [1] "a" "b" "c" "1" "2" "3"
Vectors are the first multi-item data structure all R programmers learn. Soon, though, you may find yourself frustrated with the fact that they can only hold a single type. To handle casses where you want to package multiple types (and even multiple objects!) together, we will turn to a data structure called a list.
| Capabilities | Vectors | Lists |
|---|---|---|
| Optional use of named elements | ✔ | ✔ |
| Support math operations like mean() | ✔ | |
| Hold multiple types | ✔ | |
| Hold multiple objects | ✔ |
# Create a list with list()
myList <- list(
a = 1
, b = rep(TRUE, 10)
, x = c("shakezoola", "mic", "rulah")
)
# Examine it with str()
str(myList)
## List of 3
## $ a: num 1
## $ b: logi [1:10] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ x: chr [1:3] "shakezoola" "mic" "rulah"
Imagine that you want to build a model of the relationship between resource wealth and quality-of-life outcomes like life expectancy. You got out to the World Bank to grab some data, and the dataset you get includes a column called “region” with values like “Africa”, “European Union”, and “South America”. How can you use this variable in a model or for generating region-by-region summary stats? This is where R’s factor type comes in.
regions <- c("Africa", "European Union", "South America", "South America", "European Union")
region_fac <- as.factor(regions)
str(regions)
## chr [1:5] "Africa" "European Union" "South America" "South America" ...
str(region_fac)
## Factor w/ 3 levels "Africa","European Union",..: 1 2 3 3 2
What does it mean for region to be a factor? Essentially, a factor is a categorical variable. R uses a cool trick to save memory when storing factors…internally, R will convert factor values to integers and then keep aroudn a single table the tells it, e.g., that 1 = “Africa”, 2 = “European Union”, etc..
# Check it out! R has assigned integer values to the "region_fac" variable
as.integer(region_fac)
## [1] 1 2 3 3 2
# But you can also access the character values if you want
as.character(region_fac)
## [1] "Africa" "European Union" "South America" "South America"
## [5] "European Union"
# For more, see:
str(region_fac)
## Factor w/ 3 levels "Africa","European Union",..: 1 2 3 3 2
levels(region_fac)
## [1] "Africa" "European Union" "South America"
Vectors and lists are crucial data structures in R, but you may find that they’re difficult to work with in model training and other data science tasks. It is now time to introduce a third foundational data structure: the data frame.
Data frames are tables of data. Each column of a data frame can be a different type, but all values within a column must be the same type.
# Create a data frame!
myDF <- data.frame(
time_period = c(1, 2, 3, 4, 5)
, temperature = c(25.6, 38.7, 31.4, 40.0, 29.20)
, station = c("A", "B", "A", "A", "B")
, is_gov_owned = c(TRUE, FALSE, TRUE, TRUE, FALSE)
)
# Check out the structure of this thing
str(myDF)
## 'data.frame': 5 obs. of 4 variables:
## $ time_period : num 1 2 3 4 5
## $ temperature : num 25.6 38.7 31.4 40 29.2
## $ station : chr "A" "B" "A" "A" ...
## $ is_gov_owned: logi TRUE FALSE TRUE TRUE FALSE
R comes with some sample data sets you can experiment with. Let’s load the mtcars data.frame and test out some new commands!
# Load the mtcars data frame
data("mtcars")
# Check out its structure
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# View the top 10 rows
head(mtcars, n = 10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# View the bottom 10 rows
tail(mtcars, n = 10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
Often in your code, you’ll want to do/not do something or select / not select some data based on a logical condition (a statement that evaluates to TRUE or FALSE). Here are some examples of how to construct these statements in R.
# "and" logic is expressed with "&"
TRUE & TRUE # TRUE
## [1] TRUE
TRUE & FALSE # FALSE
## [1] FALSE
FALSE & FALSE # FALSE
## [1] FALSE
-5 < 5 & 3 > 2 # TRUE
## [1] TRUE
# "or" logic is expressed with "|"
TRUE | TRUE # TRUE
## [1] TRUE
TRUE | FALSE # TRUE
## [1] TRUE
FALSE | FALSE # FALSE
## [1] FALSE
3 < 8 | 8 > 19 # TRUE
## [1] TRUE
The most common operators used to generate logicals are >, <, ==, and !=
# "equality" logic is specified with "=="
3 == 3 # TRUE
## [1] TRUE
4 == 4.1 # FALSE
## [1] FALSE
# "not" logic is specified with !. In a special case, != signifies "not equal"
!TRUE # FALSE
## [1] FALSE
!FALSE # TRUE
## [1] TRUE
! (TRUE | FALSE) # FALSE
## [1] FALSE
4 != 5 # TRUE
## [1] TRUE
# "greater than" and "less than" logic are specified in the way you might expect
5 < 5 # FALSE
## [1] FALSE
6 <= 6 # TRUE
## [1] TRUE
4 > 2 # TRUE
## [1] TRUE
3 >= 3 # TRUE
## [1] TRUE
You can use vectors of logicals (TRUE and FALSE) to subset other objects. As a general rule, when you put a vector on the left-hand side of a logical condition like == or >, you will get back a vector as a result.
vehicleSizes <- c(1, 5, 5, 2, 4)
# Create a logical index. Note that we get a VECTOR of logicals back
bigCarIndex <- vehicleSizes > 4
# Taking the SUM of a logical vector tells you the number of TRUEs.
sum(bigCarIndex)
## [1] 2
# Taking the MEAN of a logical vector tells you the proportion of TRUEs
mean(bigCarIndex)
## [1] 0.4
Subsetting is the act of retrieving a portion of an object, usually based on some logical condition (e.g. “all elements greater than 5”). In R, this is done with the [ operator.
myVec <- c(
var1 = 10
, var2 = 15
, var3 = 20
, av4 = 6
)
# "the first element"
myVec[1]
## var1
## 10
# "second to fourth element"
myVec[c(2, 3, 4)]
## var2 var3 av4
## 15 20 6
# "second to fourth elements"
myVec[2:4]
## var2 var3 av4
## 15 20 6
# "the element named var3"
myVec["var3"]
## var3
## 20
# "the elements named var1 and var3"
myVec[c("var1", "var3")]
## var1 var3
## 10 20
# "third element, then the fourth one repeated twice"
myVec[c(3, 4, 4)]
## var3 av4 av4
## 20 6 6
Lists, arbitrary collections of (potentially heterogeneous) R objects, are subsetted a bit differently than vectors. If you use [ with a list, you are guaranteed to get back another list. If you use [[, you will get back an object in its natural form (whatever it would look like if it wasn’t in the list). You can also use $ to access named elements.
# Set up a sample list object
someList <- list(
country = "DEU"
, region = "EU"
, econ_stats = list(
gdp_growth = 1.5
, rm10y = 2.75
, DAXmonthly = c(1000, 1100, 1050, 1200, 1500)
)
)
# Examine the list structure
str(someList)
## List of 3
## $ country : chr "DEU"
## $ region : chr "EU"
## $ econ_stats:List of 3
## ..$ gdp_growth: num 1.5
## ..$ rm10y : num 2.75
## ..$ DAXmonthly: num [1:5] 1000 1100 1050 1200 1500
# Grab the country name
country1 <- someList[["country"]]
country2 <- someList$country
class(country1)
## [1] "character"
class(country2)
## [1] "character"
identical(country1, country2)
## [1] TRUE
# Grab the economic stats as a list within a list
germanyStats <- someList["econ_stats"]
class(germanyStats)
## [1] "list"
# Grab the numeric vector of stock prices
daxPrices <- someList$econ_stats$DAXmonthly
class(daxPrices)
## [1] "numeric"
# Another way to parse down the list for stock prices
someList[["econ_stats"]][["DAXmonthly"]]
## [1] 1000 1100 1050 1200 1500
Data frames are the workhorse data structure of statistics in R. The best way to learn data frame subsetting is to just walk through the examples below:
# Create a data frame
someDF <- data.frame(
conference = c("Big East", "Big Ten", "Big East", "ACC", "SEC")
, school_name = c("Villanova", "Minnesota", "Marquette", "Duke", "LSU")
, wins = c(18, 14, 19, 24, 12)
, ppg = c(71.5, 45.8, 66.9, 83.4, 58.7)
)
# Grab the wins column (NOTE: will give you back a vector)
someDF[, "wins"]
## [1] 18 14 19 24 12
someDF[["wins"]]
## [1] 18 14 19 24 12
# Grab conference and wins columns
someDF[, c("conference", "wins")]
## conference wins
## 1 Big East 18
## 2 Big Ten 14
## 3 Big East 19
## 4 ACC 24
## 5 SEC 12
# Grab the first 3 rows and the two numeric columns
someDF[1:3, c("wins", "ppg")]
## wins ppg
## 1 18 71.5
## 2 14 45.8
## 3 19 66.9
So far, we’ve seen how to subset R objects using numeric indices and named elements. These are useful approaches, but both require you to know something about the contents of the obejct you’re working with.
Using these methods (especially numeric indices like saying give me columns 2-4) can make your code confusing and hard for others to reason about. Wherever possible, I strongly recommend using logical vectors for subsetting. This makes your code intuitive and more flexible to change.
# Some data frame
playerDF <- data.frame(
playerName = c("Rajon Rondo", "Paul Pierce", "Kevin Garnett", "Ray Allen", "Big Baby")
, position = c("PG", "SF", "PF", "SG", "C")
, apg = c(14.2, 3.5, 4.6, 5.0, 0.7)
, ppg = c(4.8, 31.0, 24.6, 33.2, 12.3)
, fg_percent = c(-.062, 0.48, 0.63, 0.80, 0.51)
)
# "Give me a data frame with all Rondo and KG's stats"
nameIndex <- playerDF$playerName %in% c("Rajon Rondo", "Kevin Garnett")
nameIndex
## [1] TRUE FALSE TRUE FALSE FALSE
class(nameIndex)
## [1] "logical"
playerDF[nameIndex,]
## playerName position apg ppg fg_percent
## 1 Rajon Rondo PG 14.2 4.8 -0.062
## 3 Kevin Garnett PF 4.6 24.6 0.630
# "Give me apg for players who averaged more than 20 ppg"
playerDF[playerDF$ppg > 20, c("playerName", "apg")]
## playerName apg
## 2 Paul Pierce 3.5
## 3 Kevin Garnett 4.6
## 4 Ray Allen 5.0
# "Give me stats for all the guards""
guardIndex <- grepl("G", playerDF$position)
print(guardIndex)
## [1] TRUE FALSE FALSE TRUE FALSE
playerDF[grepl("G", playerDF$position)]
## playerName ppg
## 1 Rajon Rondo 4.8
## 2 Paul Pierce 31.0
## 3 Kevin Garnett 24.6
## 4 Ray Allen 33.2
## 5 Big Baby 12.3
# "Give me stats for players who were guards OR show above 50% from the field"
playerDF[grepl("G", playerDF$position) | playerDF$fg_percent > 0.5,]
## playerName position apg ppg fg_percent
## 1 Rajon Rondo PG 14.2 4.8 -0.062
## 3 Kevin Garnett PF 4.6 24.6 0.630
## 4 Ray Allen SG 5.0 33.2 0.800
## 5 Big Baby C 0.7 12.3 0.510
Soon after you start writing code (in any language), you’ll inevitably find yourself saying “I only want to do this thing if certain conditions are met”. This type of logic is expressed using if-else syntax
DAY <- "SATURDAY"
print(DAY)
## [1] "SATURDAY"
# If it's Monday, print 2 Gauss facts. Otherwise, print 1
if (DAY == "MONDAY"){
print("message 1")
print("message 2")
} else {
print("other message")
}
## [1] "other message"
What if you want to express more than two possible outcomes? For this, we could use R’s else if construct to nest conditions. Note that conditional blocks can have any number of “else if” statements, but only one “else” block.
# Try to think through what this will do before you run it yourself:
if (4 > 5){
print("3")
} else if (6 <= (5/10)) {
print("1")
} else if (4 + 4 + 4 == 12.0001) {
print("4")
} else {
print("2")
}
## [1] "2"
One of the most powerful characteristics of general purpose programming languages is their ability to automate repetitive tasks. When you know that you want to do something a fixed number of times (say, squaring each item in a vector), you can use a for loop.
# Create a vector
x <- c(1,4,6)
# Print the square of each element one at a time
print(1^2)
## [1] 1
print(4^2)
## [1] 16
print(6^2)
## [1] 36
# BETTER: Loop over the vector and print the square of each element
for (some_number in x){
print(some_number^2)
}
## [1] 1
## [1] 16
## [1] 36
For loops are suitable for many applications, but they can be too restrictive in some cases. For example, imagine that you are writing a simple movie search engine and you want to tell R “look through an alphabetized list of movies and tell me if you find Apocalypse Now”. A for loop can certainly do this, but it will keep running over ALL movies…long after it finds Ace Ventura! This is a great place to use a while loop.
Here is the for loop implementation:
movies <- c(
"ace ventura"
, "apocalypse now"
, "return of the jedi"
, "v for vendetta"
, "zoolander"
)
MOVIE_TO_SEARCH_FOR <- "apocalypse now"
# Naive for loop implementation
i <- 1
for (movie in movies) {
if (movie == MOVIE_TO_SEARCH_FOR) {
print(paste0(i, ": found it!"))
} else {
print(paste0(i, ": not found"))
}
i <- i + 1
}
## [1] "1: not found"
## [1] "2: found it!"
## [1] "3: not found"
## [1] "4: not found"
## [1] "5: not found"
And here is the while loop implementation. Notice that this one stops checking once it finds what it wants. In this case, it’s more efficient to use a while loop to solve this problem.
movies <- c(
"ace ventura"
, "apocalypse now"
, "return of the jedi"
, "v for vendetta"
, "zoolander"
)
MOVIE_TO_SEARCH_FOR <- "apocalypse now"
# Faster while loop implementation
KEEP_SEARCHING <- TRUE
i <- 1
while (KEEP_SEARCHING){
# Check this element
if (movies[i] == MOVIE_TO_SEARCH_FOR) {
print(paste0(i, ": found it!"))
KEEP_SEARCHING <- FALSE
} else {
print(paste0(i, ": not found"))
i <- i + 1
}
# If we've reached the end, break out. Otherwise, increment the counter
if (i == length(movies)){
print("Done searching. This movie isn't in the list")
KEEP_SEARCHING <- FALSE
}
}
## [1] "1: not found"
## [1] "2: found it!"
R is a functional programming language. To write powerful, concise code, you’ll need to master the use and creation of functions.
“If you find yourself copying and pasting the same code more than twice, it’s time to write a function.” - Hadley Wickham
# Function to return only the numbers smaller than 10
nums <- c(1, 3, 4, 8, 13, 24)
getLittleNumbers <- function(some_numbahs){
lil_ones <- some_numbahs[some_numbahs < 10]
return(lil_ones)
}
getLittleNumbers(nums)
## [1] 1 3 4 8
When you call a function with arguments in R, you can provide those arguments two ways:
someFunction <- function(x, y, z){
print(paste0("x: ", x))
print(paste0("y: ", y))
print(paste0("z: ", z))
}
If you don’t use the = sign in the function call, all argument values are matched positionally. The first value is assigned to the first argument, the second to the second, and so on.
someFunction(4, 8, 10)
## [1] "x: 4"
## [1] "y: 8"
## [1] "z: 10"
If you pass all keyword arguments, you can pass them in any order. All of the function calls below have the same result.
someFunction(x = 1, y = -1, z = 3)
## [1] "x: 1"
## [1] "y: -1"
## [1] "z: 3"
someFunction(x = 1, z = 3, y = -1)
## [1] "x: 1"
## [1] "y: -1"
## [1] "z: 3"
someFunction(z = 3, y = -1, x = 1)
## [1] "x: 1"
## [1] "y: -1"
## [1] "z: 3"
If you pass a mix of positional and keyword, all of the keyword arguments are matched first. Once they’ve all been matched, the positional values are assigned to the remaining unmatched arguments in order.
someFunction(10, 100, x = -5)
## [1] "x: -5"
## [1] "y: 10"
## [1] "z: 100"
?sqrt. You’ll see that it takes one argument, named x. You can pass any vector of numeric values to this argument and sqrt() will return the square root of each elementx is a required argument of sqrt()# Take the square root of a vector of numbers
sqrt(x = c(1, 4, 9, 16, 25))
## [1] 1 2 3 4 5
# Note that calling this function without the argument will throw an error!
sqrt()
## Error in sqrt(): 0 arguments passed to 'sqrt' which requires 1
?rnorm. You’ll see that this function’s signature reads rnorm(n, mean = 0, sd = 1).# 100 random draws from a normal distribution w/ mean 0 and standard deviation 1
rand_nums <- rnorm(n = 100)
# 100 random draws from a normal distribution w/ mean 4.5 and standard deviation 1
rand_nums <- rnorm(n = 100, mean = 4.5)
As you’ve seen in previous examples, the R special word return tells a function to “give back” some value. When you execute an expression like x <- someFunction(), that function’s return value (an R object) is stored a variable called “x”.
Unlike in some other programming languages, R allows you to use multiple return values inside the body of a function. The first time that the code inside the function reaches a return value, it will pass that value back out of the function and immediately stop executing the function.
# simple function + print debugging
simpFunction <- function(n){
print("first print")
if (n > 5){
return(TRUE)
}
print("second print")
if (n < 5){
return(FALSE)
}
print("third print")
return("TRALSE")
}
simpFunction(100)
## [1] "first print"
## [1] TRUE
simpFunction(5)
## [1] "first print"
## [1] "second print"
## [1] "third print"
## [1] "TRALSE"
Not all functions have to return something! Sometimes you may want to create a function that just has some side effect like creating a plot, writing to a file, or print to the console.
These are called “null functions” and they’re common in scripting languages like R. By default, these functions return the R special value NULL.
printSentence <- function(theSentence){
words <- strsplit(
x = theSentence
, split = " "
)
for (word in words){
print(word)
}
}
# Assigning to an object is irrelevant...this function doesn't return anything
x <- printSentence("Hip means to know, it's a form of intelligence")
## [1] "Hip" "means" "to" "know," "it's"
## [6] "a" "form" "of" "intelligence"
x
## NULL
Remember when we talked about namespaces and how R searches for objects? It’s time to extend that logic to functions…which is where things get a bit weird and hard to understand.
R uses a search technique called lexical scoping
# Define an object + function in the global environment
x <- 4
someFunc <- function(x){
return(x^2)
}
# If you call someFunc without an argument, it will go up one environment and look
# for "X" in the global environment
someFunc(x)
## [1] 16
# If you pass in a new value, it will use that instead. Note that the value of
# x in the global environment remains unchanged.
someFunc(x = 6)
## [1] 36
x
## [1] 4
R’s *apply family of functions are a bit difficult to understand at first, but soon you’ll come to love them. They make your code more expressive, flexible, and parallelizable (more on that final point later). One of the most popular is lapply() (“list apply”), which loops over a thing (e.g. vector, list) and returns a 1-level list. Let’s try it out:
# Create a list
studentList <- list(
kanye = c(80, 90, 100)
, talib = c(95, 85, 99)
, common = c(100, 100, 99)
)
# Better way with lapply
grades <- lapply(studentList, mean)
You’ve seen how to loop over a vector/list and get back a list of function results. This may not be appropriate for some settings. Remember that you cannot execute statistical operations like mean() over a list. For that, we’d probably prefer to have a vector of results. This is where R’s sapply() (“simplified apply”) comes in. sapply() works the same way that lapply() does but returns a vector. Try it for yourself:
# Get some data
data("ChickWeight")
weights <- ChickWeight$weight
# Loop over and encode "above mean" and "below mean"
the_mean <- mean(weights)
meanCheck <- function(val, ref_mean){
if (val > ref_mean){
return("above mean")
}
if (val < ref_mean){
return("below mean")
}
return("equal to the mean")
}
check_vec <- sapply(
X = weights
, FUN = function(x){
meanCheck(val = x, ref_mean = the_mean)
}
)
When analyzing real-world datasets, you may want to use the same looping convention we’ve been discussing, but apply it over many items and the get some summary (such as the median) of the results. This is where R’s apply() function comes in!
apply() allows you to loop over the rows or columns of a data frame and execute an arbitrary function. The code below holds some examples of what can be accomplished with apply().
# Get some data
data("ChickWeight")
# Calculate column-wise range
cwRanges <- apply(
X = ChickWeight
, MARGIN = 2
, FUN = function(x){
range(as.numeric(x))
}
)
# Calculate row-wise range
rwRanges <- apply(
X = ChickWeight
, MARGIN = 1
, FUN = function(blah){
range(as.numeric(blah))
}
)
As you’ve probably already learned, writing code involves a never-ending process of trying this, fixing errors, trying other things, fixing new errors, etc. The process of identifying and fixing errors/bugs is called debugging.
The simplest way to debug an issue in your code is to use print debugging. This approach involves forming expectations about the state of the objects in your environment at each point in your code, then printing those states at each point to find where things broke.
# This function is supposed to sum all the numbers in a vector that are greater than 10
# but it's breaking with a weird error. Let's use print debugging to dig into it
sumBigNums <- function(aVector){
running_sum <- 0
for (num in aVector){
if (num >= 10){
running_sum <- running_sum + num
}
}
return(running_sum)
}
# This should return 45...hmmmmmm
some_vector <- c(11, 20, 5, 9.5, 10, 14)
sumBigNums(some_vector)
## [1] 55
# Let's refactor this function (change the code) and print the status at each stage
sumBigNums <- function(aVector){
running_sum <- 0
for (num in aVector){
# Grab the sum before we hit this number
sum_before <- running_sum
# If the number if greater than 10, increase the sum
if (num >= 10){
running_sum <- running_sum + num
}
# print new state of the environment
msg <- paste0("num: ", num, " | before: ", sum_before, " | after: ", running_sum)
print(msg)
}
return(running_sum)
}
# Run the code again and look at the output...it looks like we added the number
# 10 to the count even though it isn't greater than 10! We need to change
# the code to say "num > 10", not ">="
sumBigNums(some_vector)
## [1] "num: 11 | before: 0 | after: 11"
## [1] "num: 20 | before: 11 | after: 31"
## [1] "num: 5 | before: 31 | after: 31"
## [1] "num: 9.5 | before: 31 | after: 31"
## [1] "num: 10 | before: 31 | after: 41"
## [1] "num: 14 | before: 41 | after: 55"
## [1] 55
This example shows some interesting data visualizations you can make using {data.table}, {quantmod}, and {rbokeh}.
Install these packages:
install.packages(c("data.table", "quantmod", "rbokeh"))
``
```r
# Load dependencies
library(data.table)
library(quantmod)
library(rbokeh)
# Get data and plot it
quantmod::getSymbols('CPIAUCSL', src = 'FRED')
## [1] "CPIAUCSL"
cpiDT <- data.table::data.table(
CPIAUCSL
, keep.rownames = TRUE
)
# Plot it!
rbokeh::figure(
data = cpiDT
, title = "U.S. CPI"
, ylab = "Index (1982-1984 = 100)"
, xlab = "date"
) %>% ly_lines(x = cpiDT$index, y = cpiDT$CPIAUCSL, color = "blue")
In the examples so far, I’ve been defining functions and using them in the same breath. As your projects grow in complexity, you will find this really tedious and hard to keep track of.
An alternative that many intermediate R programmers turn to is defining one or more helperfuns.R scripts to store custom functions and then using source() at the top of their run scripts to make those functions available in the main program.
Let’s create two scripts.
script 1: helperfuns.R
# Define some arbitrary custom function
myFunction <- function(n){
if (n <= 5){
return(n^2)
} else {
return(n^3)
}
}
script 2: run_script.R
# uncomment the line below to source in the helper functions file
#source("helperfuns.R")
# Apply that function over some data
le_stuff <- sapply(c(1:20), myFunction)
# Plot the result
plot(x = 1:20, y = le_stuff)
Whenever you find yourself reading data into R or writing data out of it, you will need to work with file paths. File paths are just addresses on your computer’s file system. These paths can either be relative (expressed as steps above/below your current location) or absolute (full addresses).
All relative paths in R are relative to your working directory, a single location that you can set and reset any time in your session.
# Check and then change the current working directory
print(getwd())
sandbox_repo <- file.path(Sys.getenv("HOME"), "repos", "sandbox")
setwd(sandbox_repo)
# Reference a file with a full path
myDF <- read.csv(
file = file.path(
sandbox_repo
, "data"
, "some_data.csv"
)
)
# Reference a file with a relative path
myDF <- read.csv(file = "./data/some_data.csv")
R provides a few other utilities for working with file paths and directory structures.
file.path(): create a filepath from multiple partsfile.exists(): returns TRUE is a file exists and FALSE if it doesn’tlist.files(): get a character vector with names of all files found in a directorydir.exists(): returns TRUE is a directory exists and FALSE if it doesn’tdir.create(): create a new directory# List all the files in some directory and put the list in a vector
docs_dir <- file.path(
Sys.getenv("HOME")
, "some_folder"
, "docs"
)
theFiles <- list.files(path = docs_dir)
# Create a directory if it doesn't exist
slide_dir <- file.path(docs_dir, "slides")
if (!dir.exists(slide_dir)) {
dir.create(slide_dir)
}
# Check if a file exists
report_file <- file.path(docs_dir, "report.xlsx")
myFileExists <- file.exists(report_file)
To download files hosted on the internet, you can use download.file().
download.file(
url = "https://raw.githubusercontent.com/jameslamb/teaching/main/mu_rprog/sample-data/iris.csv"
, destfile = "iris.csv"
)
CSV stands for “comma-separated values”. This format is a really common to share small, tabular datasets because it is just a text file, and can be ready by many different types of programs. R has several options for reading this type of file.
read.csv(): base R function for reading CSVs into a data.framedata.table::fread(): super-fast CSV reader that creates a special type of data.frame called a data.tableread.delim(): similar to read.csv(), but can read files with any delimiterreadr::read_csv(): CSV reader from RStudioA CSV is just a plaintext file where the first row is column names separated by columns and each following row is an observation with columns separated by commas.
date,open,close
01-01-2012,10.5,8.9
01-02-2012,8.9,10.3
To begin this example, download the example file from the course repository.
iris_url <- "https://raw.githubusercontent.com/jameslamb/teaching/main/mu_rprog/sample-data/iris.csv"
iris_file <- "iris.csv"
download.file(
url = iris_url
, destfile = iris_file
)
Then read it in with read.csv() and inspect it with head().
irisDF <- read.csv(
file = iris_file
, header = TRUE
)
print(head(irisDF))
This function can also read CSV data directly from files on the internet.
irisDF <- read.csv(
file = iris_url
, header = TRUE
)
print(head(irisDF))
In the Economics / Business world (and many other areas!), Microsoft Excel is pretty much unavoidable. You’ll get data from the internet, your colleagues, clients, etc. in Excel format and may want to work with it in R. There are a few packages for doing this, but in this course we’ll focus on openxlsx.
NOTE: This package requires certain Java components that you may not have on your machine. If you run into issues, I recommend 1) installing an updated version of JRE or 2) exploring other packages like xlsx or readxl.
To begin this example, download the example file from the course repository.
gdp_url <- "https://raw.githubusercontent.com/jameslamb/teaching/main/mu_rprog/sample-data/gdp.xlsx"
gdp_file <- "gdp.xlsx"
download.file(
url = gdp_url
, destfile = gdp_file
)
Then read it in with openxlsx::read.xlsx() and view it with head().
library(openxlsx)
gdpDF <- openxlsx::read.xlsx(
xlsxFile = gdp_file
)
print(head(gdpDF))
This function can also read Excel data directly from files on the internet.
library(openxlsx)
gdpDF <- openxlsx::read.xlsx(
xlsxFile = gdp_url
)
print(head(gdpDF))
You can also use this package to write Excel files. You can do really complicated stuff (like conditional formatting, named ranges, and live formulas) from inside of R. It’s tough to set up at first, but can be VERY useful if you find yourself spending a lot of time running routine reports whose format is the same from update to update.
# load mtcars
data("mtcars")
# create a workbook object in R
testWB <- openxlsx::createWorkbook()
# Add sheets and data
openxlsx::addWorksheet(
wb = testWB
, sheetName = "car_data"
)
openxlsx::writeData(
wb = testWB
, sheet = "car_data"
, x = mtcars
)
# Write out the file
testing_file <- file.path(Sys.getenv("HOME"), "testing.xlsx")
openxlsx::saveWorkbook(
wb = testWB
, file = testing_file
)
JSON (“Javascript Object Notation”) is a standard format for “semi-structured” or “nested” data. It’s a plain-text format that can be used by many programs and programming languages.
{
"status": 200,
"data": [
{"customer_name": "Lupe", "purchases": 10},
{"customer_name": "Wale", "purchases": 30}
]
}
To begin this example, download the example file from the course repository.
tweet_url <- "https://raw.githubusercontent.com/jameslamb/teaching/main/mu_rprog/sample-data/tweets.json"
tweet_file <- "tweets.json"
download.file(
url = tweet_url
, destfile = tweet_file
)
Open the file in a text editor and inspect its contents. It contains a sample response from the Twitter API, which is used by developers to build applications that interact with Twitter. Here’s a preview:
{
"created_at": "Mon May 06 20:01:29 +0000 2019",
"id": 1125490788736032800,
"id_str": "1125490788736032770",
"text": "Today's new update means that you can finally add Pizza Cat to your Retweet with comments! Learn more about this ne… https://t.co/Rbc9TF2s5X",
"display_text_range": [
0,
140
],
"truncated": true,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 2244994945,
"id_str": "2244994945",
"name": "Twitter Dev",
"screen_name": "TwitterDev",
"location": "Internet",
"url": "https://developer.twitter.com/",
"description": "Your official source for Twitter Platform news, updates & events. Need technical help? Visit https://twittercommunity.com/ ⌨️ #TapIntoTwitter",
"translator_type": "null"
}
}
There are a few packages for reading this type of data in R. The example below uses one of the most popular, {jsonlite}. Read in the file with jsonlite::fromJSON() and view its contents with str().
tweetList <- jsonlite::fromJSON(
txt = tweet_file
, simplifyDataFrame = FALSE
)
str(tweetList, max.level = 3)
This function can also read JSON data directly from files on the internet.
tweetList <- jsonlite::fromJSON(
txt = tweet_url
, simplifyDataFrame = FALSE
)
str(tweetList, max.level = 3)
In the context of statistical programming, text files that are “unstructured” are those that do not have an obvious parallel in a data object like a data frame, vector, list, or key-value store. An example might be a large archive of tweets or a collection of court transcripts.
Working with these types of files typically involves reading them into R line-by line and then using something called regular expression(basically just pattern-matching on strings) to extract relevant features like word counts. For example:
# Parameters
shakespeare_url <- "https://ia800300.us.archive.org/5/items/shakespearessonn01041gut/wssnt10.txt"
# Read all the text into a vector (one line per element)
text_vec <- readLines(con = shakespeare_url)
# Examine a few random lines:
sample(text_vec, 5)
## [1] "Then can I drown an eye, unused to flow,"
## [2] "And like enough thou know'st thy estimate,"
## [3] ""
## [4] "So long as youth and thou are of one date;"
## [5] " Die single and thine image dies with thee."
# Coerce everything to lower-case
text_vec <- tolower(text_vec)
# Grab just the lines with the word "thou" or "twas""
thou_art_my_vector <- text_vec[grepl(pattern = "thou|twas", text_vec)]
# Print the count of lines w/ those words
lines_with_fancy_words <- length(thou_art_my_vector)
thing_to_print <- paste0(
"of the "
, length(text_vec)
, " lines of text in all of Shakespeare's sonnet, "
, lines_with_fancy_words
, " used thou or twas."
)
print(thing_to_print)
## [1] "of the 2921 lines of text in all of Shakespeare's sonnet, 307 used thou or twas."
NA is a special object in R, used to capture the idea of “a value whose value is unknown”. Confusing, right? We’re going to go through a few examples to get you feeling comfortable with missing values. They’re an inevitability in real-world data.
PRO TIP: See ?NA for R’s documentation on the nuances of NA
# Create a vector w/ missing data
some_nums <- c(1,2,NA, 6, NA, 8)
print(some_nums)
## [1] 1 2 NA 6 NA 8
# Use is.na() to get a vector of TRUE/FALSE for the question "is this element NA?"
is.na(some_nums)
## [1] FALSE FALSE TRUE FALSE TRUE FALSE
# Confirm that even w/ NAs, R still knows this is a numeric vector
class(some_nums)
## [1] "numeric"
The first approach you may take to dealing with NA values is to simply drop them from your data. If you don’t think these missing data have any business value and your dataset is big enough that you can afford to drop some rows / columns, this is the right move for you.
# Removing NAs for vectors
top5 <- c(
"Wale"
, "Chance"
, NA
, "Lupe Fiasco"
, "Shad"
, "Kanye"
, NA
)
print(top5)
## [1] "Wale" "Chance" NA "Lupe Fiasco" "Shad"
## [6] "Kanye" NA
top5cleaned <- top5[!is.na(top5)]
print(top5cleaned)
## [1] "Wale" "Chance" "Lupe Fiasco" "Shad" "Kanye"
# Removing rows with ANY NAs for data.frames
myDF <- data.frame(
x = c(1, 2, NA, 4)
, y = c(NA, TRUE, FALSE, TRUE)
, z = c("hey", "there", NA, "friends")
)
cleanDF <- myDF[complete.cases(myDF), ]
You may find the “remove all the NAs everywhere” strategy a bit too aggreesive for your use case. If you have a 100-variable dataset and a single variable (column) is 90% NA values, do you really want to drop every row where that variable is NA? A better approach might be to selectively subset out columns where missing values are most severe before using complete.cases() to remove rows.
# Create a data frame where some variable have more NAs than others
testDF <- data.frame(
var1 = sample(c(rnorm(99), NA), 200, replace = TRUE)
, var2 = sample(c(rnorm(50), rep(NA, 50)), 200, replace = TRUE)
, var3 = sample(c(rnorm(5), rep(NA, 95)), 200, replace = TRUE)
)
# Find columns that are more than 90% missing values
.percent_na <- function(a_vector){
return(sum(is.na(a_vector)/length(a_vector)))
}
colsToDrop <- apply(testDF, MARGIN = 2, .percent_na) > 0.9
cleanDF <- testDF[, !colsToDrop]
# Remove rows w/ remaining NAs
cleanDF <- cleanDF[complete.cases(cleanDF),]
A final strategy, particularly useful in modeling contexts, is to use some imputation strategy to replace NA values with reasonable alternatives. One common approach (and my favorite), the roughfix method. It works like this: - For numeric columns, replace NAs with the column median - For categorical columns, replace NAs with the most common value
# Create a data frame where some variable have more NAs than others
testDF <- data.frame(
var1 = sample(c(rnorm(99), NA), 500, replace = TRUE)
, var2 = sample(c(rnorm(70), rep(NA, 30)), 500, replace = TRUE)
, var3 = sample(c(rnorm(85), rep(NA, 15)), 500, replace = TRUE)
)
# Clean up w/ roughfix
library(randomForest)
cleanDF <- randomForest::na.roughfix(testDF)
R is famous, in part, for its ability to create production-quality plots within the default graphics package it ships with. This plotting paradigm is often referred to as “the base plotting system”.
The essential idea of the base plotting system is to build up plots in layers. You first create a simple 1-variable line plot, for example, then “add on” a legend, more variables, other plot types, etc. We’ll try a few examples using the sample data created below.
# Load up the famous iris dataset
data("iris")
head(iris, n = 10)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
Let’s start with a simple line plot to answer the question are sepal length and sepal width related?.
# Create a simple line plot
plot(
x = iris$Sepal.Length
, y = iris$Sepal.Width
, type = "p"
)
# Try again, but with labels!
plot(
x = iris$Sepal.Length
, y = iris$Sepal.Width
, main = "My Second R plot!"
, xlab = "sepal length"
, ylab = "sepal width"
, type = "p"
)
# Try it AGAIN, this time coloring by species and a legend
plot(
x = iris$Sepal.Length
, y = iris$Sepal.Width
, main = "My Third R plot!"
, xlab = "sepal length"
, ylab = "sepal width"
, type = "p"
, col = iris$Species
, bg = iris$Species
, pch = 21
)
# Add a legend
legend(
x = 7
, y = 4.3
, unique(iris$Species)
, col = 1:length(iris$Species)
, pch = 1
)
The base plotting system can be a great tool for quick exploratory analysis of data, such as examination of the distribution of variables in your data.
# Minimal Histogram
hist(iris$Petal.Length)
# Better histogram
hist(
iris$Petal.Length
, main = "Distribution of petal length"
, xlab = "petal length"
, breaks = 25
)
# Empirical density
plot(
density(iris$Petal.Length)
, main = "Empirical density of petal length"
, col = "blue"
)
You can add more than one variable to these plots! Let’s compare the densities of Sepal length by species
# Overlay densities of Petal length by species
plot(
density(iris[iris$Species == "setosa", "Petal.Length"])
, main = "Empirical density of petal length"
, col = "blue"
, xlim = c(0, 7)
, ylim = c(0, 2.5)
)
lines(density(iris[iris$Species == "versicolor", "Petal.Length"]), col = "red")
lines(density(iris[iris$Species == "virginica", "Petal.Length"]), col = "black")
# Add a legend
legend(
x = 5.5
, y = 2.25
, legend = unique(iris$Species)
, col = c("blue", "red", "black")
, pch = 1
)
You can control the plotting options to make a grid of plots. The code below creates a 2x2 grid with a density for Sepal Width and scatter plots of the other three variables against sepal width.
# Set global options
par(mfrow = c(2,2))
# Dump in some plots
plot(
density(iris$Sepal.Width)
, main = "Empirical density of petal length"
, col = "red"
)
plot(
x = iris$Sepal.Width
, y = iris$Sepal.Length
, col = iris$Species
, bg = iris$Species
, pch = 21
)
plot(
x = iris$Sepal.Width
, y = iris$Petal.Length
, col = iris$Species
, bg = iris$Species
, pch = 21
)
plot(
x = iris$Sepal.Width
, y = iris$Petal.Width
, col = iris$Species
, bg = iris$Species
, pch = 21
)
# reset options
par(mfrow = c(1,1))
When R (or any other program!) creates plots, it needs to know where to put them! When you call plot() or other commands from within and RStudio session, the default is to just display the resulting figure in the “Plots” pane. However, you can use other graphics devices (places to put visual output) to tell R to put your figures elsewhere.
# Create 10 plots in a loop
outDir <- getwd()
for (i in 1:10){
# Open a connection to a .png file
filePath <- paste0(outDir, "/plot_", i, ".png")
png(filePath)
# Write out a plot to that file
plot(x = rnorm(100), y = rnorm(100))
# Close the connection to that file
dev.off()
}
R Was Made for Statistics!
In this section, we’re going to walk through all the steps of basic statistical analysis: getting data, exploring it, creating features, building models, evaluating models, and comparing those models’ performance.
We’re going to work with R’s swiss dataset. This cross-sectional dataset contains measures of fertility and some economic indicators collected in Switzerland in 1888. Our first task will be to load the data and immediately hold out a piece of it as testing data. The idea here is that when we evaluate the performance of our models later on, it will be better to do it on data that the models haven’t seen. This will give a more honest picture of how well they might perform on new data.
In this exercise, we’re going to evaluate the following question: Can we predict fertility based on regional socioeconomic characteristics?
# Load in some data
data("swiss")
# Research: What do the variables mean?
?swiss
# Test vs. Training Data: Let's hold out 40 random records right now for testing
testIndx <- sample(1:nrow(swiss), size = 10, replace = FALSE)
swissTestDF <- swiss[testIndx,]
swiss <- swiss[-testIndx,]
Once you’ve split your data into training and test, you should start poking it a bit to see if you find anything interesting. You can use str() to view the contents of your data objects, summary() to view some basic summary statistics on data frame columns, and cor() to get a correlation matrix between all pairs of numeric variables.
# Look at the structure
str(swiss)
## 'data.frame': 37 obs. of 6 variables:
## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.9 64.1 ...
## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 45.2 62 ...
## $ Examination : int 15 6 5 12 17 9 16 14 16 21 ...
## $ Education : int 12 9 5 7 15 7 7 8 13 12 ...
## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ...
## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 24.4 16.5 ...
# Tables of summary stats
summary(swiss)
## Fertility Agriculture Examination Education
## Min. :35.00 Min. : 1.20 Min. : 3.0 Min. : 1.00
## 1st Qu.:64.40 1st Qu.:34.00 1st Qu.:12.0 1st Qu.: 7.00
## Median :70.50 Median :50.90 Median :16.0 Median : 9.00
## Mean :69.87 Mean :49.77 Mean :16.3 Mean :12.27
## 3rd Qu.:79.30 3rd Qu.:67.80 3rd Qu.:22.0 3rd Qu.:13.00
## Max. :92.50 Max. :89.70 Max. :37.0 Max. :53.00
## Catholic Infant.Mortality
## Min. : 2.15 Min. :10.80
## 1st Qu.: 6.10 1st Qu.:18.10
## Median : 18.46 Median :20.20
## Mean : 45.46 Mean :19.99
## 3rd Qu.: 93.40 3rd Qu.:22.20
## Max. :100.00 Max. :26.60
# Correlation matrix
round(cor(swiss), 2)
## Fertility Agriculture Examination Education Catholic
## Fertility 1.00 0.40 -0.67 -0.73 0.44
## Agriculture 0.40 1.00 -0.72 -0.66 0.47
## Examination -0.67 -0.72 1.00 0.77 -0.62
## Education -0.73 -0.66 0.77 1.00 -0.24
## Catholic 0.44 0.47 -0.62 -0.24 1.00
## Infant.Mortality 0.37 -0.08 -0.11 -0.12 0.07
## Infant.Mortality
## Fertility 0.37
## Agriculture -0.08
## Examination -0.11
## Education -0.12
## Catholic 0.07
## Infant.Mortality 1.00
Though orthodox hypothesis tests have been maligned in the scientific press recently, they are still enormously valuable tools for discovering differences in datasets and evaluates relationships between variables.
One approach to looking for statistically interesting features (right-hand-side variables) involves binning those variables into “above median” and “below median”, and using a paired t-test to see whether or not the variable is statistically related to the target.
# t-tests
# 1. Is fertility very different in provinces with above-median % of men in Agriculture
swiss$majority_agg <- swiss$Agriculture > median(swiss$Agriculture)
t.test(Fertility ~ majority_agg, data = swiss)
##
## Welch Two Sample t-test
##
## data: Fertility by majority_agg
## t = -1.5989, df = 29.458, p-value = 0.1205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -15.475664 1.890284
## sample estimates:
## mean in group FALSE mean in group TRUE
## 66.56842 73.36111
# 2. Let's create this feature for every variable (we'll re-use it)
#=== a. Drop that majority_agg feature
swiss$majority_agg <- NULL
#=== b. Create the binary columns
x_var_names <- names(swiss)[names(swiss) != "Fertility"]
for (var_name in x_var_names){
bin_var_name <- paste0(var_name, "_above_median")
swiss[, bin_var_name] <- swiss[, var_name] > median(swiss[, var_name])
# Remember to give the test set these new features!
swissTestDF[, bin_var_name] <- swissTestDF[, var_name] > median(swiss[, var_name])
}
We can use lapply() to apply the same test to every variable in our data frame.
# 3. Could just loop over every non-Fertility column and do this test!
bin_var_names <- grep("_above_median", names(swiss), value = TRUE)
t_tests <- lapply(bin_var_names, function(bin_var){
t_test <- t.test(
Fertility ~ get(bin_var)
, data = swiss
)
p_val <- t_test$p.value
return(data.frame(
signal = bin_var
, p_val = round(t_test$p.value, 2)
, t_stat = round(t_test$statistic, 2)
))
})
tDT <- data.table::rbindlist(t_tests)
tDT
## signal p_val t_stat
## 1: Agriculture_above_median 0.12 -1.60
## 2: Examination_above_median 0.00 3.86
## 3: Education_above_median 0.00 3.67
## 4: Catholic_above_median 0.10 -1.74
## 5: Infant.Mortality_above_median 0.04 -2.13
Ok now that we’ve done some basic preprocessing and exploration, it’s time to start fitting models! Let’s begin with a simple one-variable OLS regression, estimated using the lm() command.
# 4. Simple regression: Fertility = f(Agriculture)
mod1 <- lm(Fertility ~ Agriculture, data = swiss)
# model summary
summary(mod1)
##
## Call:
## lm(formula = Fertility ~ Agriculture, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.471 -7.913 -2.121 9.400 24.858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.84560 4.74172 12.410 2.25e-14 ***
## Agriculture 0.22157 0.08599 2.577 0.0143 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.42 on 35 degrees of freedom
## Multiple R-squared: 0.1595, Adjusted R-squared: 0.1354
## F-statistic: 6.64 on 1 and 35 DF, p-value: 0.01435
# QQ plot (are the residuals normal?)
plot(mod1, which = 2)
Once you have the fitted model, you can pass it and some new data to predict() to generate predictions. I’ve also defined calculations of the mean absolute error, mean squared error, and mean absolute percent error, common error metrics used in regression problems.
# Check how well our model does on the held-out data
preds <- predict(mod1, newdata = swissTestDF)
errors <- swissTestDF$Fertility - preds
MAE <- mean(abs(errors))
MSE <- mean(errors^2)
MAPE <- 100*mean(abs(errors)/swissTestDF$Fertility)
# Actuals vs. Preds plot
plot(
x = swissTestDF$Fertility
, y = preds
, xlab = "Actual Fertility"
, ylab = "Predicted Fertility"
, main = "Predictions from a simple OLS model"
, ylim = c(0, 100)
, xlim = c(0, 100)
)
lines(x = 0:100, y = 0:100, col = "red")
The error metric calculations and plotting we just did seem general enough to apply to other models of this phenomenon. Since we could reuse the code for other models, we should just wrap it in a function so it’s easy to call downstream.
# 5. Before moving on...lets wrap that pipeline in a function
EvaluateModel <- function(model, testDF, modelName){
# Check how well our model does on the held-out data
preds <- predict(model, newdata = testDF)
errors <- testDF$Fertility - preds
MAE <- mean(abs(errors))
MSE <- mean(errors^2)
MAPE <- 100 * mean(abs(errors) / testDF$Fertility)
# Actuals vs. Preds plot
plot(
x = testDF$Fertility
, y = preds
, xlab = "Actual Fertility"
, ylab = "Predicted Fertility"
, main = paste0(modelName, ": predictions")
, ylim = c(0, 100)
, xlim = c(0, 100)
)
# Add a 45-degree line
lines(x = 0:100, y = 0:100, col = "red")
# add error metrics
text(x = rep(5,3), y = c(72,80,88), labels = c("MAE: ", "MSE: ", "MAPE: "))
text(x = rep(15,3), y = c(72,80,88), labels = round(c(MAE, MSE, MAPE), 2))
# Give back the preds in case user wants to do something customized
return(list(
pred = preds
, MAE = MAE
, MSE = MSE
, MAPE = MAPE
))
}
To add more variables to a model in R, you have to use formula notation.
# Fit
mod2 <- lm(
Fertility ~ Education + Infant.Mortality + Examination_above_median
, data = swiss
)
# Predict + Evaluate
regPreds2 <- EvaluateModel(
mod2
, testDF = swissTestDF
, modelName = "Expanded OLS"
)
Linear regressions are powerful tools, but there are certain classes of complex relationships that they are unable to express and therefore unable to “learn” when trained on data.
Regression Trees are one mainstream tool used to fit complex functions. They recursively partition the dataset, trying to find “local” models of subsets of the data which, together, provide a better fit than the single “global” OLS model we’re used to fitting.
The details of tree-based learning are outside the scope of this class, but I encourage you to check out A Visual Introduction to Machine Learning to at least get the general intuition behind this and other ML techniques.
# Fit
treeMod <- rpart::rpart(Fertility ~ ., data = swiss)
# Evaluate
dtPreds <- EvaluateModel(
treeMod
, swissTestDF
, "Decision Tree"
)
You can walk through the tree with this awesome package called rattle. This package can be super hard to build… if you run into any issues, there are lots of nice options available via the rpart.plot package.
# Plot the tree
rattle::fancyRpartPlot(
treeMod
, main = "Single Decision Tree"
)
For larger datasets than the one we’re working with in this exercise, you may find that a single decision tree is not expressive enough or that one strong signal overpowers all the other variables. To correct for this, many analysts will use a “forest” of decision trees. The implementation detaisl are again outside the scope of this course: see Leo Breiman’s excellent site if you’re curious in learning more about this algorithm.
I only mention the random forest here to demonstrate how easy it is to fit and predict with complex statistical models in R.
# Fit
rfMod <- randomForest::randomForest(
Fertility ~ .
, data = swiss
, ntree = 50
, do.trace = TRUE
)
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 1 | 432 248.82 |
## 2 | 375.4 216.21 |
## 3 | 293.6 169.08 |
## 4 | 208.2 119.93 |
## 5 | 202.7 116.74 |
## 6 | 206.6 119.00 |
## 7 | 164.9 94.98 |
## 8 | 159.9 92.10 |
## 9 | 156.5 90.13 |
## 10 | 154.2 88.79 |
## 11 | 151 86.94 |
## 12 | 145.5 83.79 |
## 13 | 148 85.22 |
## 14 | 143.6 82.72 |
## 15 | 137.9 79.41 |
## 16 | 131.8 75.89 |
## 17 | 130.7 75.28 |
## 18 | 129.4 74.54 |
## 19 | 125.6 72.36 |
## 20 | 125.4 72.24 |
## 21 | 120.9 69.61 |
## 22 | 120.3 69.28 |
## 23 | 121.9 70.19 |
## 24 | 119.4 68.79 |
## 25 | 118.4 68.22 |
## 26 | 116.3 66.98 |
## 27 | 117.3 67.54 |
## 28 | 117.1 67.45 |
## 29 | 115.5 66.52 |
## 30 | 113.9 65.58 |
## 31 | 109.2 62.91 |
## 32 | 108.9 62.72 |
## 33 | 104.9 60.42 |
## 34 | 105.9 60.99 |
## 35 | 106.1 61.11 |
## 36 | 107.5 61.91 |
## 37 | 105.4 60.69 |
## 38 | 105.7 60.85 |
## 39 | 104.1 59.96 |
## 40 | 102.6 59.06 |
## 41 | 102.7 59.14 |
## 42 | 102.1 58.81 |
## 43 | 102.4 58.97 |
## 44 | 100.8 58.05 |
## 45 | 101 58.18 |
## 46 | 101 58.19 |
## 47 | 100.3 57.76 |
## 48 | 100 57.62 |
## 49 | 101 58.19 |
## 50 | 101.6 58.51 |
# Evaluate
rfPreds <- EvaluateModel(rfMod, swissTestDF, "Random Forest (ntree=100)")
In a real project, at some point you will find yourself saying “ok, I have all these models. What should I actually use to generate predictions?”. This question was our motivation for holding out a test set at the beginning of this walkthrough. Accuracy when predicting on previously unseen data is a totally defensible measure for choosing the “best” model from a set of candidate models.
Let’s see how the models we trained in this exercise performed:
# Build a table showing performance of each model type on the holdout
compDF <- data.frame(
modelName = c("Expanded OLS", "Decision Tree", "Random Forest (ntree=100)")
, MAE = c(regPreds2$MAE, dtPreds$MAE, rfPreds$MAE)
, MSE = c(regPreds2$MSE, dtPreds$MSE, rfPreds$MSE)
, MAPE = c(regPreds2$MAPE, dtPreds$MAPE, rfPreds$MAPE)
)
compDF
## modelName MAE MSE MAPE
## 1 Expanded OLS 5.965018 43.31007 8.791261
## 2 Decision Tree 4.358667 31.96601 6.330566
## 3 Random Forest (ntree=100) 3.925778 22.75089 5.817378
In situations where you have multiple data frames with the same rows but different columns, you can combine them column-wise with R’s cbind() command. Note that this command will only work if the two data frames to be joined have the same number of rows AND those rows refer to the same observation.
cbind = “column-wise bind”
x <- data.frame(
person = c(
"Emily", "Stephanie", "Chizom", "Corey",
"Vicki", "Victor", "Swift"
)
, followers = c(200, 1200, 800, 18000, 21775, 1123, 28965)
)
y <- data.frame(
type = c(
"personal", "tech", "personal", "tech",
"tech", "news", "tech"
)
)
cbind(x, y)
## person followers type
## 1 Emily 200 personal
## 2 Stephanie 1200 tech
## 3 Chizom 800 personal
## 4 Corey 18000 tech
## 5 Vicki 21775 tech
## 6 Victor 1123 news
## 7 Swift 28965 tech
cbind() works in the limited situation where you have two data frames that can just be jammed together (same number of rows + rows line up). This doesn’t happen too often. However, it is VERY common in data science workflows to have two mismatched tables of data from different sources and to want to combine them by matching on one or more keys. Think JOIN in SQL or VLOOKUP in Excel. To perform this operation in R, you can use the merge() command.
x <- data.frame(
person = c(
"Emily", "Stephanie", "Chizom", "Corey",
"Vicki", "Victor", "Swift"
)
, followers = c(200, 1200, 800, 18000, 21775, 1123, 28965)
, type = c(
"personal", "tech", "personal", "tech",
"tech", "news", "tech"
)
)
y <- data.frame(
type = c("news", "personal", "tech")
, avg_followers = c(900, 55, 1300)
)
merge(x, y)
## type person followers avg_followers
## 1 news Victor 1123 900
## 2 personal Emily 200 55
## 3 personal Chizom 800 55
## 4 tech Stephanie 1200 1300
## 5 tech Corey 18000 1300
## 6 tech Vicki 21775 1300
## 7 tech Swift 28965 1300
So far we’ve talked about merging columns from different tables. But what if you want to merge rows? For example, imagine that you are a researcher in a lab studying some natural phenomenon. You may take multiple samples (measuring the same variables) and then want to put them together into a single data frame to build a model. For this case, we can use R’s rbind() function.
rbind = “row-wise bind”
x <- data.frame(
grade = c("A", "B", "A")
, pct = c(95.1, 86.5, 98.3)
, col3 = rep(FALSE, 3)
)
y <- data.frame(
grade = c("B", "A", "A")
, pct = c(88.4, 99.9, 97.6)
, col3 = rep(TRUE, 3)
)
fullDF <- rbind(x, y)
print(fullDF)
## grade pct col3
## 1 A 95.1 FALSE
## 2 B 86.5 FALSE
## 3 A 98.3 FALSE
## 4 B 88.4 TRUE
## 5 A 99.9 TRUE
## 6 A 97.6 TRUE
# compare
dim(x)
## [1] 3 3
dim(y)
## [1] 3 3
dim(fullDF)
## [1] 6 3
rbind() works as a strategy for combining two tables, but what if you have 5 tables? 10? 1000? For those situations, you should checkout rbindlist() from the {data.table} package.
library(data.table)
df1 <- data.frame(
grade = c("A", "B", "A")
, pct = c(95.1, 86.5, 98.3)
)
df2 <- data.frame(
grade = c("B", "A", "A")
, pct = c(88.4, 99.9, 97.6)
)
df3 <- data.frame(
grade = c("C", "D", "F")
, pct = c(71.5, 68.7, 66.4)
, appeal = c(FALSE, FALSE, TRUE)
)
# put all the data frames in a list
df_list <- list(df1, df2, df3)
print(str(df_list, max.level = 1))
## List of 3
## $ :'data.frame': 3 obs. of 2 variables:
## $ :'data.frame': 3 obs. of 2 variables:
## $ :'data.frame': 3 obs. of 3 variables:
## NULL
# get one new big dataframe
fullDF <- data.table::rbindlist(
l = df_list
, fill = TRUE
)
print(fullDF)
## grade pct appeal
## 1: A 95.1 NA
## 2: B 86.5 NA
## 3: A 98.3 NA
## 4: B 88.4 NA
## 5: A 99.9 NA
## 6: A 97.6 NA
## 7: C 71.5 FALSE
## 8: D 68.7 FALSE
## 9: F 66.4 TRUE
Ok so you have a business question, you’ve chosen your toolchain (presumably with R), and you have some data in hand. You sit down to write code and, well…
Don’t fear! In this section, we’re going to walk through the process of building a non-trivial script from scratch.
Resist the urge to just start writing code. Investing a few minutes upfront in thinking through the structure of your code will pay off in a big way as the project evolves and grows more complicated. Trust me.
First, just write down the main things you want to do. In this exercise, we’re going to write some R code that can generate n-page “books” or random sentences in English.
#=== Write an R script that writes a book! ===#
# Function to create random sentences
# Function to create a "page" with n sentences
# Function to create a "book" with m pages
# Call the book function and create a 5-page book
Next, fill in the high-level outline with slightly more specifics. Try to strategize about the functions you’ll need to implement and the individual steps that will have to happen inside each of those functions. This will probably change once you start writing code, but in my experience it’s always easier to have a plan and change it.
# Function to create random sentences
createSentence <- function(){
# Define a list of nouns
# Define a list of adjectives
# Define a list of adverbs
# Define a list of verbs
# Define a list of articles
# Define a list of prepositions
# Choose randomly from each, construct a sentence of the form
# article-adjective-noun-adverb-verb-preposition
# Return the sentence
}
# Function to create a "page" with n sentences
createPage <- function(n){
# Call the function to create pages n times
# Paste the n results together into one string, separated by a period and a space.
# Return that one string
}
# Function to create a "book" with m pages
createBook <- function(n_pages, sentences_per_page){
# Call the function to create pages n_pages times
# Return a list with 1 page per element
}
# Call the book function and create a 5-page book
You’ll spend the most time on this and you will have to go through it many times before you’re feeling comfortable with the code. I’ve given one implementation below. The point here is not to show you how to create a book-writing app, but rather to demonstrate that this somewhat complicated task was made easier by breaking it into smaller, more manageable pieces.
# Function to create random sentences
createSentence <- function(){
# Define a list of nouns
nouns <- c("parrot", "giant squid", "whale", "captain", "first mate")
# Define a list of adjectives
adjectives <- c("hearty", "brave", "grimy", "tough", "swarthy")
# Define a list of adverbs
adverbs <- c("quickly", "carefully")
# Define a list of verbs
verbs <- c("scurried", "trundled", "rolled", "walked")
# Define a list of articles
articles <- c("a", "the")
# Define a list of prepositions
prepositions <- c("away", "below")
# Construct a sentence of the form article-adjective-noun-adverb-verb.
sentence <- paste(
sample(articles, 1)
, sample(adjectives, 1)
, sample(nouns, 1)
, sample(adverbs, 1)
, sample(verbs, 1)
, sample(prepositions, 1)
)
# Return the sentence
return(sentence)
}
# Function to create a "page" with n sentences
createPage <- function(n){
# Call the function to create pages n times
some_sentences <- sapply(
X = 1:n
, FUN = function(x){
createSentence()
}
)
# Paste the n results together into one string, separated by a period and a space.
a_page <- paste(some_sentences, collapse = ". ")
# Return that one string
return(a_page)
}
# Function to create a "book" with m pages
createBook <- function(n_pages, sentences_per_page){
# Call the function to create pages n_pages times
a_book <- lapply(
X = 1:n_pages
, FUN = function(x){
createPage(n = sentences_per_page)
}
)
# Return a list with 1 page per element
return(a_book)
}
# Call the book function and create a 5-page book
aPirateTale <- createBook(n_pages = 5, sentences_per_page = 10)
print(aPirateTale)
## [[1]]
## [1] "a grimy first mate quickly trundled away. a brave parrot quickly rolled below. the tough giant squid quickly walked below. the brave giant squid carefully rolled below. the hearty first mate carefully trundled away. a brave first mate quickly trundled below. a grimy first mate quickly walked away. a swarthy captain carefully rolled away. a hearty whale carefully scurried below. the swarthy first mate carefully trundled below"
##
## [[2]]
## [1] "a swarthy first mate quickly trundled below. the brave first mate quickly rolled below. a grimy giant squid quickly trundled away. a brave captain carefully rolled below. a hearty first mate carefully trundled below. the grimy giant squid carefully scurried below. the brave captain quickly scurried below. the grimy first mate carefully trundled below. the hearty parrot carefully walked below. a tough giant squid carefully walked away"
##
## [[3]]
## [1] "a swarthy first mate quickly rolled away. the brave whale carefully trundled below. a grimy whale carefully scurried away. the hearty first mate quickly rolled below. the brave first mate carefully rolled below. the tough first mate carefully rolled away. a grimy giant squid quickly scurried away. a grimy parrot quickly walked away. the swarthy first mate quickly trundled below. a swarthy captain quickly walked away"
##
## [[4]]
## [1] "a hearty whale carefully rolled away. a swarthy first mate carefully rolled below. a brave first mate carefully walked away. a hearty first mate quickly scurried below. a swarthy whale quickly walked away. a hearty giant squid quickly trundled below. the brave first mate carefully trundled away. a swarthy whale carefully scurried away. a brave whale carefully trundled away. a brave first mate carefully trundled away"
##
## [[5]]
## [1] "the brave whale quickly walked below. a tough parrot quickly rolled away. the brave first mate quickly scurried below. the brave parrot quickly trundled below. a swarthy giant squid carefully rolled away. a hearty whale carefully walked away. a grimy parrot carefully walked away. the brave parrot quickly scurried below. the hearty parrot quickly walked below. the brave giant squid carefully scurried below"
In this section, we’re going to use the Shakespeare corpus and implement a basic “keyword extraction” pipeline. Let’s start by loading it and doing some common string preprocessing on it.
# Grab a random 5000 lines from the corpus (skipping a bunch of that MIT text at the beginning)
shakespeareFile <- "https://ia800300.us.archive.org/5/items/shakespearessonn01041gut/wssnt10.txt"
bsText <- readLines(con = shakespeareFile, n = 10000)[5001:10000]
# 1 - Convert everything to lowercase
bsText <- tolower(bsText)
# 2 - Trim leading and trailing whitespace (e.g. change " a" to "a")
bsText <- trimws(bsText)
# 3 - Remove empty lines
bsText <- bsText[sapply(bsText, nchar)>0]
After applying these basic steps, we’re going to want to do some more powerful things like removing or replacing text based on particular character patterns. It is time to enter the mystical world of regular expressions.
Regular expressions (used in many programming languages) offer a way to express complex pattern matching. Let’s work through some examples to demonstrate the properties.
# 4 - Remove punctuation
bsText <- gsub(
pattern = ";|,|!|?|\\.|\\:|<|>|\\]|\\["
, replacement = ""
, x = bsText
)
You can run sample(bsText, 10) and see that our data are looking cleaner…but we still have work to do! Next, let’s use regular expressions (commonly just called “regex”) to handle some other issues.
# 5 - split some common contractions into two words
bsText <- gsub("he('s)", "he is", bsText)
bsText <- gsub("'ll", " will", bsText)
# 6 - Change any numbers to __number__
bsText <- gsub("[:digit:]", "__number__", bsText)
The next task we need to acomplish is tokenizing our text, i.e. splitting lines and sentences into individual words. These individual words can then be used downstream to get build a language model and identify key terms.
# 7- Loop over the vector of lines, split on whitespace, create a list of data.frames
library(stringr)
library(data.table)
allWords <- lapply(
X = bsText
, FUN = function(thisLine){
return(data.frame(words = str_split(thisLine, " ")[[1]]))
}
)
# 8 - Put into a data.table and print that just to make sure it worked
wordDT <- data.table::rbindlist(allWords)
print(wordDT[1:10], row.names = FALSE)
## words
## <NA>
## <NA>
## <NA>
## <NA>
## <NA>
## <NA>
## <NA>
## <NA>
## <NA>
## <NA>
Now that our data are a bit cleaner, it’s time to try finding key terms! Broadly speaking, “key terms” in a body of text are those that more common in the text than they are in the language as a whole (e.g. “the” will never be a key word). We aren’t looking at any actual data on the distribution of words in Shakespeare-era English in this exercise, so we’ll just drop the top 20 words and call the next 20 “key”.
The data.table package makes this operation easy to carry out.
# 9 - Get Word counts and sort by those counts
wordCountDT <- wordDT[, .N, by = words]
data.table::setnames(
wordCountDT
, old = "N"
, new = "word_count"
)
data.table::setorder(wordCountDT, -word_count)
print(wordCountDT[1:10], row.names = FALSE)
## words word_count
## <NA> 5000
## <NA> NA
## <NA> NA
## <NA> NA
## <NA> NA
## <NA> NA
## <NA> NA
## <NA> NA
## <NA> NA
## <NA> NA
key_words <- wordCountDT[21:40]
Use this slide as a general reference of coding best practices.
library() calls at the top of your script(s)#===== Section 1 - Do Stuff =====# to separate major sections of the code:: (e.g. lubridate::ymd_hms())For other practices to keep your code clean and readable, check out Hadley Wickham’s style guide for R.
My script below shows an example of a typical layout for professional-quality R code.
make_plot.R
#===== 1. Setup =====#
# Load dependencies
library(data.table)
library(quantmod)
library(rbokeh)
library(purrr)
# Set global params
FRED_SERIES <- "CPIAUCSL" # UNRATE
TITLE <- "U.S. CPI" # "U.S. unemployment""
VAR_UNITS <- "Index (1982-1984 = 100)" # "% of labor force"
#===== 2. Get Data =====#
# Get data from FRED
quantmod::getSymbols(Symbols = FRED_SERIES, src = 'FRED')
## [1] "CPIAUCSL"
# Put it in a data.table
dataDT <- data.table::data.table(
get(FRED_SERIES)
, keep.rownames = TRUE
)
#===== 3. Plot it! =====#
# Plot it!
rbokeh::figure(
data = dataDT
, title = TITLE
, ylab = VAR_UNITS
, xlab = "date"
, width = 800
) %>% rbokeh::ly_lines(
x = dataDT[,index]
, y = dataDT[,get(FRED_SERIES)]
, color = "blue"
)
There are around 10,000 packages currently on CRAN but MANY more privately-created packages in use by academics, professional data science teams, and hobbyists.
I’d like to demystify the package through an exercise…we are going to build an R package from scratch here in class! Packages are a robust and effective way to share your code with others and to manage dependencies in your code.
check out this article on how Airbnb uses internal R packages
We are going to create a package called RquetteBB, a package for fetching, cleaning, and visualizing Marquette basketball statistics.
Before we begin, be sure that you have a few of the mainstream package-building libraries in R. If you don’t have any of these in your R library, please install them:
?library(remotes)
library(roxygen2)
library(testthat)
DESCRIPTION = a file containing information on dependencies, authors, and other useful metadataNAMESPACE = an auto-generated file that contains informaiton on the names of objects defined in your package and the names of objects from other packages that your package relies onLICENSE = a text file with some statement of the license that applies to the package. Don’t stress…this can be informal and goofy.R/ = a directory with all the “.R” functions that your package exportstests/testthat = a directory with all the tests used to verify that your functions are working correctlytests/testthat.R = a script that instructs R how to run your unit testsman/ = a directory with all the documentation that accompanies your functions. The files in here are auto-generated by {roxygen}You can do most of the setup with R code!
setwd()) to wherever you’d like the package to live.repo_directory <- file.path(Sys.getenv("HOME"), "repos")
dir.create(
repo_directory
, showWarnings = FALSE
)
setwd(repo_directory)
package.skeleton() to create a basic package setup# Create the package skeleton
package_name <- "RquetteBB"
package.skeleton(
name = package_name
)
# Remove some unnecessary files
package_path <- file.path(repo_directory, "RquetteBB")
file.remove(file.path(package_path, "Read-and-delete-me"))
file.remove(file.path(package_path, "NAMESPACE"))
unlink(file.path(package_path, "data"), recursive = TRUE)
unlink(file.path(package_path, "man"), recursive = TRUE)
DESCRIPTION in the text editor of choice
0.1Author sectionDescription:License: file LICENSETitle:LICENSElicense_txt <- "You can use my stuff but you better give me credit!"
license_file <- file.path(package_path, "LICENSE")
file.create(license_file)
write(license_txt, file = license_file)
man/ directory to store documentation filesdir.create(file.path(package_path, "man"), showWarnings = FALSE)
tests/ and tests/testthat directories
tests/testthat.Rtests/testthat/test-my_functions.Rtest_path <- file.path(package_path, "tests", "testthat")
dir.create(
test_path
, showWarnings = FALSE
, recursive = TRUE
)
file.create(file.path(package_path, "tests", "testthat.R"))
file.create(file.path(test_path, "test-my_functions.R"))
R/my_functions.Rfile.create(file.path(package_path, "R", "my_functions.R"))
Alternatively, I’ve provided a shell script you can run to generate this package.
make_rquettebb_pkg.sh
# Create the package directory
mkdir -p ${HOME}/repos
cd ${HOME}/repos
# Create package skeleton
Rscript -e "package.skeleton(name = 'RquetteBB', list = c('LETTERS'))"
cd RquetteBB
# Remove unnecessary stuff
rm Read-and-delete-me
rm -rf data/
rm man/*.Rd
rm NAMESPACE
# Create all the required directories
mkdir -p R
mkdir -p tests/testthat
# Create all the required files
echo 'You can use this but you better cite me!\n' > LICENSE
touch R/my_functions.R
touch tests/testthat.R
touch tests/testthat/test-my_functions.R
Next, it’s time to add your first function to the package! Some of the items we discuss later (like editing the DESCRIPTION file) will really only make sense if we have a least one function in the package.
Open up R/my_functions.R. Let’s add a function called NamesToAbbreviations. We’ll use this function later on to create plot labels from players’ names.
RquetteBB/R/my_functions.R
#' @title Names to abbreviations
#' @name NamesToAbbreviations
#' @description This function takes a vector of names and returns a vector of corresponding initials.
NamesToAbbreviations <- function(nameVec){
# Split on whitespace
splitNames <- stringr::str_split(
nameVec
, pattern = " "
, simplify = FALSE
)
# Grab the first letter of each name and combines
abbrevs <- purrr::map_chr(splitNames, function(splitName){
firstInitial <- substr(splitName[1], 1, 1)
lastInitial <- substr(splitName[2], 1, 1)
return(paste0(firstInitial, lastInitial))
})
# Return the character vector of abbreviations
return(abbrevs)
}
Now that we have a function in the package, we need to generate documentation for the package and install it! Do the following:
package_path <- file.path(Sys.getenv("HOME"), "repos", "RquetteBB")
setwd(package_path)
roxygen2::roxygenize(). This function will populate your NAMESPACE file and (if we had written any yet) generate documentation that users could call with ?rxygen2::roxygenize()
remotes::install_local() to install the package!remotes::install_local(package_path, force = TRUE)
rm(list=ls()) to clear your working environment. Restart your R session.library(RquetteBB) to load the package?NamesToAbbreviations to see the documentation for that functionCongratulations on creating your first R package! Lots of work still to be done, but you now have a minimum viable product (MVP) of the RquetteBB package.
As you saw when we ran ?NamesToAbbreviations, the documentation for our package is looking pretty weak right now. In this section, you’ll learn how to document your functions with the popular {roxygen2} package.
install.packages("roxygen2")
Return to R/my_functions.R. Right above your function, add some comments like those shown below. Note that the use of an apostrophe (#') is intentional and important here.
RquetteBB/R/my_functions.R
#' @title Names to abbreviations
#' @name NamesToAbbreviations
#' @description This function takes a vector of names and returns a vector of corresponding initials.
#' @param nameVec A character vector of player names
#' @importFrom purrr map_chr
#' @importFrom stringr str_split
#' @export
NamesToAbbreviations <- function(nameVec){
#...
When you’ve added this information, save my_functions.R and re-run roxygen::roxygenize(). Now try running ?NamesToAbbreviations to see your documentation!
These metadata items are called “roxygen comments”. Lines in package R files that begin with #' are analyzed by the {roxygen2} package when you run roxygen2::roxygenize().
@title: A short description of what the function does@name: The name of the function@description: A longer description of what the function does@param: An explanation of the purpose of a particular function arguments, the types of valid inputs to that function, and the consequences of passing different values to it. You should have one of these items per function argument.@importFrom: A list of functions from an external library that your particular function needs. These are of the form @importFrom package_name func1 func2 func3@export: Putting this in your roxygen comments tells roxygen to export this function to the package’s namespace (so users of your package can call it)Next, open up DESCRIPTION in a text editor of your choice. We need to add these two new dependencies! Under the Depends: line, add an Imports: blocks like this:
RquetteBB/DESCRIPTION
Package: RquetteBB
Title: Marquette Basketball Data in R
Version: 0.1
Authors@R: person("James", "Lamb", email = "jaylamb20@gmail.com", role = c("aut", "cre"))
Description: This package can be used to pull, analyze, and visualize Marquette men's basketball data.
Depends: R (>= 3.3.2)
Imports:
purrr,
stringr
License: file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.0.2
There are four types of dependency declarations used in R package DESCRIPTION files.
library() on your package. Typically this section will just be used to enforce a minimum version of R that users of your package must have. If you put R packages here, they will automatically be loaded with library() whenever someone loads your package.library(your_package) but they will be installed when someone tries to install your package.{testthat} for writing unit tests and {rmarkdown} for building documentaiton PDFs.In software development, it’s a best practice to write your code in modular, composible “units”. Units are essentially small, related pieces of functionality. It’s also common to write tests for these bit of code, commonly referred to as “unit tests”.
The main idea behind unit tests is to try to imagine every possible type of input that a user might try to feed to your function, and then to test that the function does what it should when given that input. These tests are tedious to write at first, but they help you to make chagnes rapidly in the future…you can experiment and innovate with confidence when you know that your tests will catch any mistakes you make!
The preferred unit testing framework in R is a package called {testthat}. The easiest way to learn this package is just to use it…let’s try it out!
install.packages("testthat")
Open tests/testthat/test-my_functions.R and add the following code.
RquetteBB/tests/testthat/test-my_functions.R
context("Testing RquetteBB functions")
#===== 1. NamesToAbbreviations
test_that("NamesToAbbreviations should return the expected output for simple vectors", {
someNames <- c("Joe Buck", "Chris Berman", "Michelle Tafoya")
expect_identical(NamesToAbbreviations(someNames), c("JB", "CB", "MT"))
expect_true(class(NamesToAbbreviations(someNames)) == "numeric")
})
Save this file, the run testthat::test_dir("tests/testthat/") from the console. You should get something like this:
The text in white is the “context” you specified. The white dot signifies one passed tests. The number 1 indicates your first test failure. I you had 5 failures and 12 successes you may see somthing like .1...23...4...5...
It looks like we messed up our second test! We were testing that the result would be numeric, but this funciton will always return a character vector. Change the “numeric” to “character’” in your test file and re-run.
FWIW…the test we just wrote is called an “end-to-end” test. Basically, that means a “test where we didn’t do anything weird”. End-to-end tests are the most straightfoward, since they just involve testing that the funciton behaves as expected with the expected input type. Other tests (we may write some later) go out of their way to test behavior in particular weird situations.
Now that we’re using testthat, we should add it to the Suggests section in our DESCRIPTION file. Open up that file and add this under your Imports section.
RquetteBB/DESCRIPTION
Suggests:
testthat
Next, let’s add another function. This function, which we’ll call GetData, should go out to espn.com and scrape down player statistics for any specified season of Marquette basketball.
RquetteBB/R/my_functions.R
#' @title Get Marquette Men's Basketball Data
#' @name GetData
#' @description This function scrapes sports-reference.com and returns some
#' player statistics for Marquette's men's basketball team.
#' @param season String indicating which season you want to pull data for.
#' e.g. set to "2017" for 2016-17 season stats
#' @importFrom data.table data.table
#' @importFrom htmltab htmltab
#' @export
GetData <- function(season = "2018"){
# Build URL
queryURL <- paste0(
"https://www.sports-reference.com/cbb/schools/marquette/"
, season
, ".html"
)
# Grab tables from college basketbal reference
result <- htmltab::htmltab(
queryURL
, which = "//table[@id = 'per_game']"
)
# Convert to data.table
statDT <- data.table::as.data.table(result)
# Coerce all the columns but Player to numeric
numCols <- setdiff(names(statDT), "Player")
statDT <- cbind(
statDT[, "Player"]
, statDT[, lapply(.SD, as.numeric), .SDcols = numCols]
)
# Return the table
return(statDT)
}
Now that we’ve added a new function, we need to re-do a few thing:
roxygen2::roxygenize() to regenerate package documentaitonDESCRIPTION file
RquetteBB/tests/testthat/test-my_functions.R
context("Testing RquetteBB functions")
#===== 1. NamesToAbbreviations
# End-to-end test
test_that("NamesToAbbreviations should return the expected output for simple vectors", {
someNames <- c("Joe Buck", "Chris Berman", "Michelle Tafoya")
expect_identical(NamesToAbbreviations(someNames), c("JB", "CB", "MT"))
expect_true(class(NamesToAbbreviations(someNames)) == "character")
})
#===== 2. NamesToAbbreviations
# End-to-end test
test_that("GetData should return a data.table", {
statDT <- GetData(season = "2018")
expect_true("data.table" %in% class(statDT))
expect_true("Player" %in% names(statDT))
})
You’re doing great! We should add one more function to the package. This function, CompareOnStats(), should take a data table of season stats and create a scatterplot showing pairs of statistics for each player. For example, you could use this function to answer the question Did Marquette’s highest scorers also tend to be beter rebounders?.
You’re a pro at adding new functions now, so I won’t belabor the points about adding documentation, updating the DESCRIPTION file, etc.. Let’s jump straight into an implementation of the funciton:
RquetteBB/R/my_functions.R
#' @title Scatterplot Comparison of Player Stats
#' @name CompareOnStats
#' @description Given a data.table of Marquette men's basketball statistics, plot a bivariate
#' scatterplot of player performance on the basis of two statistics
#' @import data.table
#' @importFrom graphics plot text
#' @param statDT A data.table of Marquette season stats, pulled with \code{\link[RquetteBB]{GetData}}
#' @param x_stat A string with the name of the statistic to plot on the x-axis
#' @param y_stat A string with the name of the statistic to plot on the y-axis
#' @param season String indicating which season you pull data for. Used to create plot title
#' @param useInitials A boolean. If TRUE, player initials will be plotted. If FALSE, full names will be used.
#' @export
CompareOnStats <- function(statDT
, x_stat = "PTS"
, y_stat = "TRB"
, season
, useInitials = TRUE
){
# Create a scatterplot
plot(
x = statDT[, get(x_stat)]
, y = statDT[, get(y_stat)]
, xlab = x_stat
, ylab = y_stat
, type = "p"
, col = "white"
, main = paste0(x_stat, " vs. ", y_stat, " (", season, ")")
)
# Grab abbreviations from player names if asked
if (useInitials){
namesToPlot <- NamesToAbbreviations(statDT[, Player])
} else {
namesToPlot <- statDT[, Player]
}
# Add those abbreviations to the plot
text(
x = statDT[, get(x_stat)]
, y = statDT[, get(y_stat)]
, labels = namesToPlot
)
return(invisible(NULL))
}
Congratulations on reaching the end of this tutorial. You’ve now created your first R package! We can use it to do some cool stuff.
# Load up our package
library(RquetteBB)
# Plot the 2002-2003 data
statsDT <- GetData(season = "2003")
CompareOnStats(
statDT = statsDT
, "PTS"
, "TRB"
, season = "2002-03"
, useInitials = FALSE
)
# Plot the 2018-2019 data
statsDT <- GetData(season = "2019")
CompareOnStats(
statDT = statsDT
, "PTS"
, "TRB"
, season = "2018-19"
, useInitials = FALSE
)